{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import gc\n",
    "import gc\n",
    "import os\n",
    "import random\n",
    "import lightgbm as lgb\n",
    "from pandas.api.types import is_datetime64_any_dtype as is_datetime\n",
    "from pandas.api.types import is_categorical_dtype\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.model_selection import KFold\n",
    "import datetime\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# normal reduce memory usage function\n",
    "def reduce_mem_usage(df, use_float16 = False):\n",
    "    \"\"\"\n",
    "    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        \n",
    "    \"\"\"\n",
    "    \n",
    "    start_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(\"Memory usage of dataframe is {:.2f} MB\".format(start_mem))\n",
    "    \n",
    "    for col in df.columns:\n",
    "        if is_datetime(df[col]) or is_categorical_dtype(df[col]):\n",
    "            continue\n",
    "        col_type = df[col].dtype\n",
    "        \n",
    "        if col_type != object:\n",
    "            c_min = df[col].min()\n",
    "            c_max = df[col].max()\n",
    "            if str(col_type)[:3] == \"int\":\n",
    "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
    "                    df[col] = df[col].astype(np.int8)\n",
    "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
    "                    df[col] = df[col].astype(np.int16)\n",
    "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
    "                    df[col] = df[col].astype(np.int32)\n",
    "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
    "                    df[col] = df[col].astype(np.int64)  \n",
    "            else:\n",
    "                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
    "                    df[col] = df[col].astype(np.float16)\n",
    "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
    "                    df[col] = df[col].astype(np.float32)\n",
    "                else:\n",
    "                    df[col] = df[col].astype(np.float64)\n",
    "        else:\n",
    "            df[col] = df[col].astype(\"category\")\n",
    "\n",
    "    end_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(\"Memory usage after optimization is: {:.2f} MB\".format(end_mem))\n",
    "    print(\"Decreased by {:.1f}%\".format(100 * (start_mem - end_mem) / start_mem))\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# add timestamp and reduce memory\n",
    "def df_process(df, reduce = True, use_float16 = False):\n",
    "    try:\n",
    "        df[\"timestamp\"] = pd.to_datetime(df[\"timestamp\"], format = '%Y-%m-%d %H:%M:%S')\n",
    "    except:\n",
    "        print('There is no timestamp column')\n",
    "    # reduce memory\n",
    "    if reduce:\n",
    "        df = reduce_mem_usage(df, use_float16 = use_float16)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fill weather by group of site day and month and use the mean to fill\n",
    "def fill_weather_dataset(weather_df):\n",
    "    \n",
    "    # Find Missing Dates\n",
    "    time_format = \"%Y-%m-%d %H:%M:%S\"\n",
    "    start_date = datetime.datetime.strptime(weather_df['timestamp'].min(),time_format)\n",
    "    end_date = datetime.datetime.strptime(weather_df['timestamp'].max(),time_format)\n",
    "    total_hours = int(((end_date - start_date).total_seconds() + 3600) / 3600)\n",
    "    hours_list = [(end_date - datetime.timedelta(hours=x)).strftime(time_format) for x in range(total_hours)]\n",
    "\n",
    "    missing_hours = []\n",
    "    for site_id in range(16):\n",
    "        site_hours = np.array(weather_df[weather_df['site_id'] == site_id]['timestamp'])\n",
    "        new_rows = pd.DataFrame(np.setdiff1d(hours_list,site_hours),columns=['timestamp'])\n",
    "        new_rows['site_id'] = site_id\n",
    "        weather_df = pd.concat([weather_df,new_rows])\n",
    "\n",
    "        weather_df = weather_df.reset_index(drop=True)           \n",
    "\n",
    "    # Add new Features\n",
    "    weather_df[\"datetime\"] = pd.to_datetime(weather_df[\"timestamp\"])\n",
    "    weather_df[\"day\"] = weather_df[\"datetime\"].dt.day\n",
    "    weather_df[\"week\"] = weather_df[\"datetime\"].dt.week\n",
    "    weather_df[\"month\"] = weather_df[\"datetime\"].dt.month\n",
    "    \n",
    "\n",
    "    weather_df[\"timestamp\"] = pd.to_datetime(weather_df[\"timestamp\"], format='%Y-%m-%d %H:%M:%S')\n",
    "\n",
    "    weather_df['local_time'] = weather_df['timestamp']\n",
    "    \n",
    "    # Reset Index for Fast Update\n",
    "    weather_df = weather_df.set_index(['site_id','day','month'])\n",
    "\n",
    "    air_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['air_temperature'].mean(),columns=[\"air_temperature\"])\n",
    "    weather_df.update(air_temperature_filler,overwrite=False)\n",
    "\n",
    "    # Step 1\n",
    "    cloud_coverage_filler = weather_df.groupby(['site_id','day','month'])['cloud_coverage'].mean()\n",
    "    # Step 2\n",
    "    cloud_coverage_filler = pd.DataFrame(cloud_coverage_filler.fillna(method='ffill'),columns=[\"cloud_coverage\"])\n",
    "\n",
    "    weather_df.update(cloud_coverage_filler,overwrite=False)\n",
    "\n",
    "    due_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['dew_temperature'].mean(),columns=[\"dew_temperature\"])\n",
    "    weather_df.update(due_temperature_filler,overwrite=False)\n",
    "\n",
    "    # Step 1\n",
    "    sea_level_filler = weather_df.groupby(['site_id','day','month'])['sea_level_pressure'].mean()\n",
    "    # Step 2\n",
    "    sea_level_filler = pd.DataFrame(sea_level_filler.fillna(method='ffill'),columns=['sea_level_pressure'])\n",
    "\n",
    "    weather_df.update(sea_level_filler,overwrite=False)\n",
    "\n",
    "    wind_direction_filler =  pd.DataFrame(weather_df.groupby(['site_id','day','month'])['wind_direction'].mean(),columns=['wind_direction'])\n",
    "    weather_df.update(wind_direction_filler,overwrite=False)\n",
    "\n",
    "    wind_speed_filler =  pd.DataFrame(weather_df.groupby(['site_id','day','month'])['wind_speed'].mean(),columns=['wind_speed'])\n",
    "    weather_df.update(wind_speed_filler,overwrite=False)\n",
    "\n",
    "    # Step 1\n",
    "    precip_depth_filler = weather_df.groupby(['site_id','day','month'])['precip_depth_1_hr'].mean()\n",
    "    # Step 2\n",
    "    precip_depth_filler = pd.DataFrame(precip_depth_filler.fillna(method='ffill'),columns=['precip_depth_1_hr'])\n",
    "\n",
    "    weather_df.update(precip_depth_filler,overwrite=False)\n",
    "\n",
    "    weather_df = weather_df.reset_index()\n",
    "    weather_df = weather_df.drop(['datetime','day','week','month'],axis=1)\n",
    "        \n",
    "    return weather_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/ipykernel_launcher.py:15: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
      "of pandas will change to not sort by default.\n",
      "\n",
      "To accept the future behavior, pass 'sort=False'.\n",
      "\n",
      "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
      "\n",
      "  from ipykernel import kernelapp as app\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of dataframe is 10.72 MB\n",
      "Memory usage after optimization is: 4.16 MB\n",
      "Decreased by 61.2%\n"
     ]
    }
   ],
   "source": [
    "df_train_init_weather = df_process(fill_weather_dataset(pd.read_csv('../Resources/weather_train.csv')), use_float16=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of dataframe is 616.95 MB\n",
      "Memory usage after optimization is: 289.19 MB\n",
      "Decreased by 53.1%\n",
      "There is no timestamp column\n",
      "Memory usage of dataframe is 0.07 MB\n",
      "Memory usage after optimization is: 0.02 MB\n",
      "Decreased by 73.8%\n"
     ]
    }
   ],
   "source": [
    "# process the data\n",
    "df_train = df_process(pd.read_csv('../Resources/train.csv'), use_float16 = True)\n",
    "df_building = df_process(pd.read_csv('../Resources/building_metadata.csv'), use_float16 = True)\n",
    "# df_train_weather = df_process(pd.read_csv('../Large_output/csv/train_weather_filled_1.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = df_train.loc[df_train['building_id'] != 1099,:]\n",
    "df_train = df_train.query('not (building_id <= 104 & meter == 0 & timestamp <= \"2016-05-20\")')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train_merge = df_train.merge(df_building, on='building_id', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# add the hodliday and log the square feet and use label encoder to change primary_use to numeric\n",
    "def prepare_data(X, test=False, has_month = True):\n",
    "    X.sort_values(\"timestamp\")\n",
    "    X.reset_index(drop=True)\n",
    "    gc.collect()\n",
    "    X.local_time = pd.to_datetime(X.local_time, format=\"%Y-%m-%d %H:%M:%S\")\n",
    "    holidays = [\"2016-01-01\", \"2016-01-18\", \"2016-02-15\", \"2016-05-30\", \"2016-07-04\",\n",
    "                    \"2016-09-05\", \"2016-10-10\", \"2016-11-11\", \"2016-11-24\", \"2016-12-26\",\n",
    "                    \"2017-01-02\", \"2017-01-16\", \"2017-02-20\", \"2017-05-29\", \"2017-07-04\",\n",
    "                    \"2017-09-04\", \"2017-10-09\", \"2017-11-10\", \"2017-11-23\", \"2017-12-25\",\n",
    "                    \"2018-01-01\", \"2018-01-15\", \"2018-02-19\", \"2018-05-28\", \"2018-07-04\",\n",
    "                    \"2018-09-03\", \"2018-10-08\", \"2018-11-12\", \"2018-11-22\", \"2018-12-25\",\n",
    "                    \"2019-01-01\"]    \n",
    "    X[\"is_holiday\"] = (X.local_time.isin(holidays)).astype(int)\n",
    "    le = LabelEncoder()\n",
    "    X['primary_use'] = le.fit_transform(X['primary_use'])\n",
    "    X.timestamp = pd.to_datetime(X.timestamp, format=\"%Y-%m-%d %H:%M:%S\")\n",
    "    X[\"hour\"] = X.local_time.dt.hour\n",
    "    X[\"weekend\"] = X.local_time.dt.weekday\n",
    "    X[\"month\"] = X.local_time.dt.month\n",
    "    X['square_feet']=np.log1p(X['square_feet'])\n",
    "    drop_features = ['timestamp', 'floor_count', 'year_built', 'local_time',\n",
    "                     'wind_speed', 'wind_direction', 'sea_level_pressure']\n",
    "    if not has_month:\n",
    "        drop_features.append('month')\n",
    "    X.drop(drop_features, axis = 1, inplace = True)\n",
    "    if test:\n",
    "        row_ids = X.row_id\n",
    "        X.drop(\"row_id\", axis=1, inplace=True)\n",
    "        return X, row_ids\n",
    "    else:\n",
    "        y = np.log1p(X.meter_reading)\n",
    "        X.drop(\"meter_reading\", axis=1, inplace=True)\n",
    "        return X, y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0           3.190624\n",
      "1           0.318163\n",
      "2           0.000000\n",
      "3           5.171529\n",
      "4           4.524668\n",
      "              ...   \n",
      "19852315    2.277267\n",
      "19852316    1.762159\n",
      "19852317    0.000000\n",
      "19852318    5.078761\n",
      "19852319    1.348073\n",
      "Name: meter_reading, Length: 19852320, dtype: float32\n"
     ]
    }
   ],
   "source": [
    "df_train_1 = pd.merge(df_train_merge, df_train_init_weather,\n",
    "                      on=['site_id', 'timestamp'], how='left')\n",
    "X, y = prepare_data(df_train_1)\n",
    "X.to_csv('../../../Large_output/init_train_merge.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
